Apache Spark MLlib


In [1]:
import org.apache.spark._
import org.apache.spark.rdd._
import org.apache.spark.mllib.classification._
import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.linalg._

In [2]:
println(sc.version)

In [3]:
def loadCsv(path: String): RDD[LabeledPoint] = {
    import scala.util.Try

    sc.textFile(path).flatMap { line =>
        Try {
            val row = line.split(",").map { _.toInt }
            LabeledPoint(
                label = row.head,
                features = new DenseVector(row.tail.map { _.toDouble })
            )
        }.toOption
    }
}

In [4]:
val Array(train, test) = loadCsv("mnist_train.csv").randomSplit(Array(0.8, 0.2), seed=333L)

/** We will use training and test datsets more than once.
 *   To avoid recalculation they should be persisted (cached).
 */
train.persist()
test.persist()


Out[4]:
MapPartitionsRDD[5] at randomSplit at <console>:31

Logistic regression


In [5]:
/** Default logistic regression. **/
val logreg = new LogisticRegressionWithLBFGS().setNumClasses(10)

In [6]:
/** The same could be done by `LogisticRegression.train` methods **/
val trainedModel = logreg.run(train)

In [7]:
/** Construction `arg: { def f(...): ... }` means any class that contains
 *   a method with the same name and signature.
 */
def testAccuracy(model: { def predict(f: Vector): Double })(data: RDD[LabeledPoint]): Double = {
    val prediction = data.map { lp =>
        val pred = model.predict(lp.features)
        (lp.label, pred)
    }
    
    import org.apache.spark.mllib.evaluation._
    val metrics = new MulticlassMetrics(prediction)
    val cm = metrics.confusionMatrix
    
    val correct = (0 until cm.numCols).map { i => cm(i, i) }.sum
    val total = cm.toArray.sum
    
    correct / total
}

In [1]:
println {
    testAccuracy(trainedModel)(test)
}


0.8922071861875875

Regularisation


In [9]:
val logreg = new LogisticRegressionWithLBFGS().setNumClasses(10)
logreg.optimizer.setRegParam(0.1)

val trainedModel = logreg.run(train)

In [2]:
println {
    testAccuracy(trainedModel)(test)
}


0.8922071861875875

SVM


In [11]:
def rocauc(model: { def predict(f: Vector): Double })(data: RDD[LabeledPoint]): Double = {
    val prediction = data.map { lp =>
        val pred = model.predict(lp.features)
        (lp.label, pred)
    }
    
    import org.apache.spark.mllib.evaluation._
    val metrics = new BinaryClassificationMetrics(prediction)
    metrics.areaUnderROC()
}

In [12]:
val digit0_train = train.map { lp => lp.copy(label = if (lp.label == 0) 1.0 else 0.0) }
val digit0_test = test.map { lp => lp.copy(label = if (lp.label == 0) 1.0 else 0.0) }

digit0_train.persist()
digit0_test.persist()

val svm = SVMWithSGD.train(digit0_train, numIterations=250, stepSize=0.1, regParam=0.1, miniBatchFraction=0.1)

In [13]:
rocauc(svm)(digit0_test)


Out[13]:
0.9478939639502503

In [14]:
testAccuracy(svm)(digit0_test)


Out[14]:
0.9833177788147457

Random Forest


In [15]:
import org.apache.spark.mllib.tree.RandomForest

In [16]:
val rf = RandomForest.trainClassifier(input = train, numClasses=10,
    categoricalFeaturesInfo=Map.empty[Int, Int],
    numTrees=5,
    featureSubsetStrategy="log2", impurity ="gini", maxDepth=25, maxBins = 100, seed=333)

In [17]:
testAccuracy(rf)(test)


Out[17]:
0.886724218385441

Gradient Boosting


In [18]:
import org.apache.spark.mllib.tree._
import org.apache.spark.mllib.tree.configuration._

In [19]:
val boostingStrategy = BoostingStrategy.defaultParams("Classification")

boostingStrategy.numIterations = 2
boostingStrategy.treeStrategy.numClasses = 2
boostingStrategy.treeStrategy.maxDepth = 25

boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map.empty[Int, Int]

println {
    boostingStrategy
}

In [20]:
val gbt = GradientBoostedTrees.train(train, boostingStrategy)

In [21]:
testAccuracy(gbt)(test)


Out[21]:
0.9540873460246361

GBT regression


In [22]:
val boostingStrategy = BoostingStrategy.defaultParams("Regression")

boostingStrategy.numIterations = 2
boostingStrategy.treeStrategy.maxDepth = 25

boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map.empty[Int, Int]

println {
    boostingStrategy
}

In [23]:
val gbt = GradientBoostedTrees.train(train, boostingStrategy)

In [24]:
testAccuracy(gbt)(test)


Out[24]:
0.15212319178721417